<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <!-- Meta tags for social media banners; fill these in appropriately, as they act as the page's "business card" -->
  <!-- Replace each content attribute with the appropriate information -->
  <meta name="description" content="A novel philosophy for eliminating LLM watermarks in existing texts.">
  <meta property="og:title" content="Lost in Overlap: Exploring Logit-based Watermark Collision in LLMs">
  <meta property="og:description" content="A novel philosophy for eliminating LLM watermarks in existing texts.">
  <meta property="og:url" content="https://ainnovatelab.github.io/watermark-collision/">
  <!-- Absolute URL of the banner image. Optimal dimensions are 1200x630. -->
  <!-- NOTE(review): the declared size below (373x212) is far smaller than the recommended one; confirm it matches the actual file. -->
  <meta property="og:image"
    content="https://ainnovatelab.github.io/watermark-collision/static/images/watermark_collision-intro.jpg">
  <meta property="og:image:type" content="image/jpeg">
  <meta property="og:image:width" content="373">
  <meta property="og:image:height" content="212">


  <meta name="twitter:title" content="Lost in Overlap: Exploring Logit-based Watermark Collision in LLMs">
  <meta name="twitter:description" content="A novel philosophy for eliminating LLM watermarks in existing texts.">
  <!-- Absolute URL of the banner image. Optimal dimensions are 1200x600. -->
  <!-- Uses the same lower-case site path as og:url and og:image so both cards point at the same file. -->
  <meta name="twitter:image"
    content="https://ainnovatelab.github.io/watermark-collision/static/images/watermark_collision-intro.jpg">
  <meta name="twitter:card" content="summary_large_image">
  <!-- Keywords under which the paper should be indexed -->
  <meta name="keywords" content="watermark,watermarking,LLM,language model,NLP">
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <title>Lost in Overlap: Exploring Logit-based Watermark Collision in LLMs</title>
  <link rel="icon" type="image/x-icon" href="static/images/favicon.ico">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <!-- Stylesheets: Bulma and its extensions, icon fonts, then the page-specific rules -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts: libraries are listed before static/js/index.js, which is loaded last -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script>
    // MathJax configuration; it is defined before the MathJax loader script below.
    // Accepts $...$ as an inline-math delimiter in addition to \(...\).
    MathJax = {
      tex: {
        inlineMath: [['$', '$'], ['\\(', '\\)']]
      }
    }
  </script>
  <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
  <script src="static/js/index.js"></script>
</head>

<body>


  <section class="hero">
    <!-- Title banner: paper title, author list, affiliations, venue and resource links -->
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-2 publication-title">Lost in Overlap: Exploring Logit-based Watermark Collision in LLMs
            </h1>
            <div class="is-size-5 publication-authors">
              <!-- Paper authors. Superscripts: "*" marks equal contribution, numbers index the affiliations below. -->
              <!-- Every entry except the last one ends with a separating comma. -->
              <span class="author-block">
                <a href="https://mrtater.github.io/" target="_blank" rel="noopener">Yiyang Luo</a><sup>*,1</sup>,</span>
              <span class="author-block">
                <a href="https://leonardodalinky.github.io/researcher" target="_blank" rel="noopener">Ke
                  Lin</a><sup>*,2</sup>,</span>
              <span class="author-block">
                <a href="https://guch8017.github.io/researcher" target="_blank" rel="noopener">Chao
                  Gu</a><sup>*,3</sup>,</span>
              <span class="author-block">
                <!-- "&" inside an attribute value is written as &amp; -->
                <a href="https://scholar.google.com/citations?user=z2gblvQAAAAJ&amp;hl=en" target="_blank"
                  rel="noopener">Jiahui Hou</a><sup>3</sup>,</span>
              <span class="author-block">
                <a href="https://www.thss.tsinghua.edu.cn/en/faculty/lijiewen.htm" target="_blank"
                  rel="noopener">Lijie Wen</a><sup>2</sup>,</span>
              <span class="author-block">
                <a href="https://www.thss.tsinghua.edu.cn/en/faculty/pingluo.htm" target="_blank"
                  rel="noopener">Ping Luo</a><sup>2</sup></span>
            </div>

            <div class="is-size-5 publication-authors">
              <!-- Affiliations and venue -->
              <span class="author-block"><sup>1</sup>Nanyang Technological University,</span>
              <span class="author-block"><sup>2</sup>Tsinghua University,</span>
              <span class="author-block"><sup>3</sup>University of Science and Technology of China</span>
              <br>
              <span class="author-block">NAACL '25 Findings</span>
              <span class="eql-cntrb"><small><br><sup>*</sup>Equal Contribution</small></span>
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- Icons are decorative; the adjacent text label names each link -->
                <!-- Arxiv PDF link -->
                <span class="link-block">
                  <a href="https://arxiv.org/pdf/2403.10020" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf" aria-hidden="true"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>

                <!-- Github link -->
                <span class="link-block">
                  <a href="https://github.com/AInnovateLab/watermark-collision" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github" aria-hidden="true"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>

                <!-- ArXiv abstract Link -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2403.10020" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv" aria-hidden="true"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <!-- Teaser video-->
  <!-- <section class="hero teaser">
    <div class="container is-max-desktop">
      <div class="hero-body">
        <video poster="" id="tree" autoplay controls muted loop height="100%">
          <source src="static/videos/banner_video.mp4" type="video/mp4">
        </video>
        <h2 class="subtitle has-text-centered">
          Aliquam vitae elit ullamcorper tellus egestas pellentesque. Ut lacus tellus, maximus vel lectus at, placerat
          pretium mi. Maecenas dignissim tincidunt vestibulum. Sed consequat hendrerit nisl ut maximus.
        </h2>
      </div>
    </div>
  </section> -->
  <!-- End teaser video -->

  <!-- Paper abstract -->
  <section class="section hero is-light">
    <!-- Abstract: one centred, justified paragraph; the source keeps one sentence per line -->
    <div class="container is-max-desktop">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-4 has-text-centered">Abstract</h2>
          <div class="content has-text-justified">
            <p>
              The proliferation of large language models (LLMs) in generating content raises concerns about text copyright.
              Watermarking methods, particularly logit-based approaches, embed imperceptible identifiers into text to address these challenges.
              However, the widespread usage of watermarking across diverse LLMs has led to an inevitable issue known as watermark collision during common tasks, such as paraphrasing or translation.
              In this paper, we introduce watermark collision as a novel and general philosophy for watermark attacks, aimed at enhancing attack performance on top of any other attacking methods.
              We also provide a comprehensive demonstration that watermark collision poses a threat to all logit-based watermark algorithms, impacting not only specific attack scenarios but also downstream applications.
            </p>
          </div>
        </div>
      </div>
    </div>
  </section>
  <!-- End paper abstract -->


  <!-- Image carousel -->
  <!-- <section class="hero is-small">
    <div class="hero-body">
      <div class="container">
        <div id="results-carousel" class="carousel results-carousel">
          <div class="item">
            <img src="static/images/carousel1.jpg" alt="MY ALT TEXT" />
            <h2 class="subtitle has-text-centered">
              First image description.
            </h2>
          </div>
          <div class="item">
            <img src="static/images/carousel2.jpg" alt="MY ALT TEXT" />
            <h2 class="subtitle has-text-centered">
              Second image description.
            </h2>
          </div>
          <div class="item">
            <img src="static/images/carousel3.jpg" alt="MY ALT TEXT" />
            <h2 class="subtitle has-text-centered">
              Third image description.
            </h2>
          </div>
          <div class="item">
            <img src="static/images/carousel4.jpg" alt="MY ALT TEXT" />
            <h2 class="subtitle has-text-centered">
              Fourth image description.
            </h2>
          </div>
        </div>
      </div>
    </div>
  </section> -->
  <!-- End image carousel -->




  <!-- Youtube video -->
  <!-- <section class="hero is-small is-light">
    <div class="hero-body">
      <div class="container">
        <h2 class="title is-3">Video Presentation</h2>
        <div class="columns is-centered has-text-centered">
          <div class="column is-four-fifths">

            <div class="publication-video">
              <iframe src="https://www.youtube.com/embed/JkaxUblCGz0" frameborder="0" allow="autoplay; encrypted-media"
                allowfullscreen></iframe>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section> -->
  <!-- End youtube video -->


  <!-- Video carousel -->
  <!-- <section class="hero is-small">
    <div class="hero-body">
      <div class="container">
        <h2 class="title is-3">Another Carousel</h2>
        <div id="results-carousel" class="carousel results-carousel">
          <div class="item item-video1">
            <video poster="" id="video1" autoplay controls muted loop height="100%">
              <source src="static/videos/carousel1.mp4" type="video/mp4">
            </video>
          </div>
          <div class="item item-video2">
            <video poster="" id="video2" autoplay controls muted loop height="100%">
              <source src="static/videos/carousel2.mp4" type="video/mp4">
            </video>
          </div>
          <div class="item item-video3">
            <video poster="" id="video3" autoplay controls muted loop height="100%">
              <source src="static/videos/carousel3.mp4" type="video/mp4">
            </video>
          </div>
        </div>
      </div>
    </div>
  </section> -->
  <!-- End video carousel -->

  <section class="section hero is-small">
    <!-- Summary: overview figure followed by the list of contributions -->
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column is-full">
          <h2 class="title is-4 has-text-centered">Summary</h2>
          <div class="content">
            <div class="has-text-justified">
              <img src="static/images/watermark_collision-intro.svg" alt="Overview of Watermark Collisions."
                class="center-image" width="50%">
              <br>
              <p>
                In summary, this paper proposes <b>a new watermark attack philosophy for all logit-based watermarks</b>
                in LLMs. Our contributions are as follows:
              </p>
              <!-- The list is a sibling of the paragraph: a <ul> is not allowed inside <p> -->
              <ul>
                <li>We propose a novel philosophy for watermark attacks that can effectively remove existing watermarks
                  from text. This approach can be integrated with various traditional attack methods to enhance their
                  performance.
                </li>
                <li>We find that the strength of overlapping watermarks impacts detection performance. Upstream and
                  downstream watermarks generally compete for detection accuracy, with one being stronger and the other
                  weaker.</li>
                <li>We discuss the vulnerability of watermarking techniques caused by watermark collisions.</li>
              </ul>
            </div>
          </div>

        </div>
      </div>
    </div>
  </section>


  <section class="hero is-small is-light">
    <!-- Methodology: pipeline figure followed by a description of its three components -->
    <div class="hero-body">
      <div class="container  is-max-desktop">
        <h2 class="title is-4 has-text-centered">Methodology</h2>
        <div class="has-text-justified">
          <img src="static/images/watermark_collision-pipeline.svg" alt="Overview pipeline of Watermark Collisions."
            class="center-image" width="95%">
          <br>
          <p>
            To prove the existence of watermark collisions, we design pipelines with three main components:
            <b>watermarker</b>, <b>colliders</b>, and <b>detectors</b>:
          </p>
          <!-- The list is a sibling of the paragraph: a <ul> is not allowed inside <p> -->
          <!-- Text between $...$ is typeset by MathJax -->
          <ul>
            <li><b>Watermarker</b> $W$ generates watermarked texts $T_W$ by using a language model (LM) to create
              content based on a specific corpus as context. As illustrated in the pipeline, we first produce
              the watermarked text data $T_W$ with <em>Watermarker</em> $W$. Additionally, we generate unwatermarked
              text
              $T_{W'}$ using the same context and prompt as $T_W$ for further comparisons. </li>
            <li><b>Colliders</b> $C$ are designed to attack the watermark created by the <em>watermarker</em> using
              collision techniques. There are three distinct <em>colliders</em> that apply such collision attacks
              through
              traditional attack methods, namely paraphraser, back-translator, and mask-and-filler.
              <ul>
                <li><em>Paraphraser</em> $P$ rephrases the watermarked texts $T_W$ with different watermarks, i.e.
                  generated by different methods or keys, to generate paraphrased text data $T_P$, which are intended to
                  contain dual watermarks simultaneously. Furthermore, we also generate texts $T_{P'}$ using the same
                  paraphraser but without a watermark, denoted as $P'$, for further comparison.</li>
                <li><em>Translator</em> $R$ translates the watermarked texts $T_W$ to other languages and then
                  translates back to their original language with watermarks.</li>
                <li><em>Mask-and-filler</em> (MnF) $M$ is specifically designed for mask-and-fill attacks. The MnF
                  attack method is commonly used with masked language models, e.g., BERT-based models.</li>
              </ul>
            </li>
            <li><b>Detector</b> $D_P$ targets watermarks in paraphrasers, $D_R$ focuses on those in
              translators, and $D_M$ is for watermarks in the MnF process. Detector $D_W$ aims to identify the original
              watermark embedded by the watermarker. By comparing the results from these detectors, we can assess the
              effectiveness of the attacks with or without additional watermarks.</li>
          </ul>
        </div>
      </div>
    </div>
  </section>



  <section class="section hero is-small">
    <!-- Experiments: result tables and figure, each followed by a centred caption; <hr> separates the entries -->
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column is-full">
          <div class="content">
            <h2 class="title is-4 has-text-centered">Experiments</h2>
            <!-- Each alt text repeats the caption so the result is identifiable without the image -->
            <img src="static/images/table1.png"
              alt="Table 1: TPR of the paraphrased text with dual watermarks at 1% FPR" class="center-image"
              width="95%">
            <br>
            <div class="has-text-centered">
              <strong><em>TPR of the paraphrased text $T_P$ with dual watermarks when $\text{FPR}=1\%$</em></strong>
            </div>
            <p>$W$ and $P$ represent the watermarker and paraphraser, respectively.
              $D_W$ and $D_P$ represent the detector of the watermarker and paraphraser. $\varnothing$ indicates that no
              paraphrasing process is applied to the text, and its corresponding column represents the result of using
              $D_W$ to detect watermark $W$ in $T_W$.
              $P'$ represents paraphrasing $T_W$ without watermark.</p>
            <hr>
            <img src="static/images/table2.png"
              alt="Table 2: TPR of the back-translated text with dual watermarks at 1% FPR" class="center-image"
              width="90%">
            <br>
            <div class="has-text-centered">
              <strong><em>TPR of the back-translated text $T_R$ with dual watermarks when
                  $\text{FPR}=1\%$.</em></strong>
            </div>
            <hr>
            <img src="static/images/table34.png" alt="Tables 3 &amp; 4: Text quality" class="center-image"
              width="95%">
            <br>
            <div class="has-text-centered">
              <strong><em>Text Quality</em></strong>
            </div>
            <hr>
            <img src="static/images/figure3.png" alt="Figure 3: Multi-round collisions" class="center-image"
              width="95%">
            <br>
            <div class="has-text-centered">
              <strong><em>Multi-round Collisions</em></strong>
            </div>
            <!-- NOTE(review): the disabled alert below does not appear to relate to this paper; consider removing it. -->
            <!-- <div class="container mt-4">
              <div class="alert alert-danger" role="alert">
                <strong>Implications:</strong> SoTA LLMs that use LoRA for alignment fine-tuning are vulnerable to
                Pre-FT weight recovery attacks
              </div>
            </div> -->
          </div>
        </div>
      </div>
    </div>
  </section>

  <!-- <section class="hero is-small is-light">
    <div class="hero-body">
      <div class="container">
        <h2 class="title is-4 has-text-centered">Examples</h2>
        <div id="results-carousel" class="carousel results-carousel">
          <div class="item">
            <img src="static/images/0_predict.png" alt="Prediction 0" />
            <h2 class="subtitle has-text-centered">
              First image description.
            </h2>
          </div>
          <div class="item">
            <img src="static/images/1_predict.png" alt="Prediction 1" />
          </div>
          <div class="item">
            <img src="static/images/2_predict.png" alt="Prediction 2" />
          </div>
          <div class="item">
            <img src="static/images/3_predict.png" alt="Prediction 3" />
          </div>
        </div>
      </div>
    </div>
  </section> -->




  <!-- Paper poster -->
  <!-- <section class="hero is-small is-light">
    <div class="hero-body">
      <div class="container">
        <h2 class="title">Poster</h2>

        <iframe src="static/pdfs/sample.pdf" width="100%" height="550">
        </iframe>

      </div>
    </div>
  </section> -->
  <!--End paper poster -->


  <!--BibTex citation -->
  <section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <!-- Whitespace inside <pre> is rendered verbatim, so the entry starts directly after <code> and ends
           directly before </code>; otherwise blank lines are shown and copied along with the citation. -->
      <!-- The entry title matches the title shown at the top of this page. -->
      <pre><code>@article{luo2024lost,
  title={Lost in Overlap: Exploring Logit-based Watermark Collision in LLMs},
  author={Luo, Yiyang and Lin, Ke and Gu, Chao and Hou, Jiahui and Wen, Lijie and Luo, Ping},
  journal={arXiv preprint arXiv:2403.10020},
  year={2024}
}</code></pre>
    </div>
  </section>
  <!--End BibTex citation -->


  <footer class="footer">
    <!-- Site credits and content license -->
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <p>
              This page was modified by K. Lin using the <a
                href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank"
                rel="noopener">Academic Project Page Template</a> which was adapted from the <a
                href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              This website is licensed under a <a rel="license noopener"
                href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
                Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

  <!-- Statcounter tracking code -->

  <!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

  <!-- End of Statcounter Code -->

</body>

</html>